package com.symria.system.test;

import java.util.ArrayList;
import java.util.HashMap;  
import java.util.Iterator;  
import java.util.List;  
import java.util.Map;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  

public class ParseContent {
    // 过滤<a></a>标签  
    private static String regxpForATag = "<\\s*a\\s.*?href\\s*=\\s*[^>]*\\s*>\\s*(.*?)\\s*<\\s*/\\s*a\\s*>";  
    private static Pattern patternHref = Pattern.compile(regxpForATag,  
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);  
    private static Matcher matcherHref = null;  
  
    // 过滤Img标签  
    private static String regxpForImgTag = "<\\s*img\\s*([^>]*)\\s*/?\\s*>";  
    private static Pattern patternImg = Pattern.compile(regxpForImgTag,  
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);  
  
    // 过滤Img标签下的src  
    private static final Pattern patternImgStr = Pattern.compile(  
            "<\\s*img\\s*(?:[^>]*)src\\s*=\\s*([^>]+)",  
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);  
  
    // 过滤Javascript标签  
    // "<\\s*script\\s*.*[^>]*\\s*>\\s*(.*?)\\s*<\\s*/\\s*script\\s*>"  
    // "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>"  
    private static String regxpForJsTag = "<\\s*script\\s*[^>]*?\\s*>\\s*[\\s\\S]*?\\s*<\\s*/\\s*script\\s*?>";  
    private static Pattern patternJs = Pattern.compile(regxpForJsTag,  Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);  
  
    // 过滤标签标签下的class  
    private static final Pattern patternStyleClass = Pattern.compile(  
            "<\\s*.*(class\\s*=\\s*[^>]+)?\\s*>", Pattern.CASE_INSENSITIVE  
                    | Pattern.MULTILINE);  
  
    // 过滤link标签  
    private static final Pattern patternLink = Pattern.compile(  
            "<\\s*link\\s.*?href\\s*=\\s*[^>]*\\s*/?\\s*>",  
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);  
  
    // 过滤style标签  
    private static final Pattern patternStyle = Pattern.compile(  
            "<\\s*style\\s*[^>]*?\\s*>\\s*[\\s\\S]*?\\s*<\\s*/\\s*style\\s*?>",  
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);  
  
    private ParseContent() {  
    }  
  
    /** 
     * 1、<a><img /></a> 
     *  
     * @param xmlString 
     *             
     * @return 
     */  
    private static String filterdAdvertising(String xmlString) {  
        // matcher = pattern  
        // .matcher("<img src='' >dsgsdgfsd<a href = ' ' ><imG src='sdfsdf'/>< /  
        // A >sdfgsdhf < A hrEf = \" dsd\" > sdgsdfgsdg</a>1212121 < img src = '  
        // ' / >< a href ='href'><img href='hhhhhh' />sdfsfd dfdf</a>1sdfsdf1");  
  
        List<String> advertisingList = null;  
        String returnString = xmlString;  
        if (xmlString != null && !"".equals(returnString)) {  
            matcherHref = patternHref.matcher(xmlString);  
            advertisingList = new ArrayList<String>();  
  
            while (matcherHref.find()) {  
                String hrefString = matcherHref.group();  
                Matcher matcherImg = patternImg.matcher(hrefString);  
                while (matcherImg.find()) {  
                    advertisingList.add(matcherHref.group());  
                }  
            }  
        }  
        // System.out.println(returnString);  
        if (advertisingList != null && advertisingList.size() > 0) {  
            for (String string : advertisingList) {  
                // System.out.println(string);  
                returnString = returnString.replace(string, "$$");  
                // System.out.println(returnString);  
                // System.out.println("############################");  
            }  
        }  
        return returnString;  
    }  
  
   
  
    /** 
     * 过滤Img下的src 
     *  
     * @param imgString 
     * @return 
     */  
    private static List<String> getImgSrc(String imgString) {  
        Matcher matcher = patternImgStr.matcher(imgString);  
        List<String> list = new ArrayList<String>();  
        while (matcher.find()) {  
            String group = matcher.group(1);  
            if (group == null) {  
                continue;  
            }  
            // 这里可能还需要更复杂的判断,用以处理src="...."内的一些转义符  
            if (group.startsWith("'")) {  
                list.add(group.substring(1, group.indexOf("'", 1)));  
            } else if (group.startsWith("\"")) {  
                list.add(group.substring(1, group.indexOf("\"", 1)));  
            } else {  
                list.add(group.split("\\s")[0]);  
            }  
        }  
        // for (String string : list) {  
        // System.out.println(string);  
        // }  
        return list;  
    }  
  
    /** 
     * 过滤Img下的src 
     *  
     * @param imgString 
     * @return 
     */  
    private static String filterImgSrc(String imgString) {  
        Matcher matcher = patternImgStr.matcher(imgString);  
        String returnString = null;  
        while (matcher.find()) {  
            String group = matcher.group(1);  
            if (group == null) {  
                continue;  
            }  
            // 这里可能还需要更复杂的判断,用以处理src="...."内的一些转义符  
            if (group.startsWith("'")) {  
                returnString = group.substring(1, group.indexOf("'", 1));  
            } else if (group.startsWith("\"")) {  
                returnString = group.substring(1, group.indexOf("\"", 1));  
            } else {  
                returnString = group.split("\\s")[0];  
            }  
        }  
        return returnString;  
    }  
  
    /** 
     * 过滤掉Javascript 
     *  
     * @param contentString 
     * @return 
     */  
    private static String filterScript(String contentString) {  
        String returnString = contentString;  
        Matcher matcher = patternJs.matcher(returnString);  
        while (matcher.find()) {  
            String group = matcher.group();  
            if (group == null) {  
                continue;  
            }  
            returnString = returnString.replace(group, "$$");  
        }  
        // System.out.println(returnString);  
        return returnString;  
    }  
  
    /** 
     * 过滤掉样式class="" 
     *  
     * @param contentString 
     * @return 
     */  
    private static String filterStyleClass(String contentString) {  
        String returnString = contentString;  
  
        Pattern patternStyleClass = Pattern.compile(  
                "(\\s*class\\s*=\\s*[\"|\'](.*?)[\"|\']\\s*?)",  
                Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);  
        Matcher matcher = patternStyleClass.matcher(returnString);  
        while (matcher.find()) {  
            String group = matcher.group();  
            if (group == null) {  
                continue;  
            }  
            // System.out.println(group);  
            returnString = returnString.replace(group, "$$");  
        }  
        // System.out.println(returnString);  
        return returnString;  
    }  
  
    /** 
     * 过滤link标签 
     *  
     * @param contentString 
     * @return 
     */  
    public static String filterLink(String contentString) {  
        String returnString = contentString;  
        Matcher matcher = patternLink.matcher(returnString);  
        while (matcher.find()) {  
            String group = matcher.group();  
            if (group == null) {  
                continue;  
            }  
            // System.out.println(group);  
            returnString = returnString.replace(group, "$$");  
        }  
        // System.out.println(returnString);  
        return returnString;  
    }  
  
    /** 
     * 过滤style标签 
     *  
     * @param contentString 
     * @return 
     */  
    public static String filterStyle(String contentString) {  
        String returnString = contentString;  
        Matcher matcher = patternStyle.matcher(returnString);  
        while (matcher.find()) {  
            String group = matcher.group();  
            if (group == null) {  
                continue;  
            }  
            // System.out.println(group);  
            returnString = returnString.replace(group, "$$");  
        }  
        // System.out.println(returnString);  
        return returnString;  
    }  
  
    public static void main(String[] args) {  
        String contentString = "ddddd<style class=\"testsdfsdfsfdsgdsghdfs\" sdf sdf  sdf ></style><div style='display:none'>sdf</div><link class='1234' href='东四饭店sgs对公' > <style href=''>ddd</style><link href='javascript:function (){}' ></div>的闪光灯<tr class=\"hidsdy\"></tr> <style href=''>ddd</style>";  
        System.out.println(filterStyle(contentString));  
    }  
  
    /** 
     * 过滤全部 
     *  
     * @param contentString 
     * @param savepath 
     * @return 
     */  
    public static String filterContent(String contentString, String savepath)  
            throws Exception {  
        String returnString = contentString;  
        // System.out.println(returnString);  
        //1、过滤广告  
        returnString = filterdAdvertising(returnString);  
        //2、过滤图片  
        //3、过滤script  
        returnString = filterScript(returnString);  
        //4、过滤link  
        returnString = filterLink(returnString);  
        //5、过滤Style  
        returnString = filterStyle(returnString);  
        //6、过滤class  
        returnString = filterStyleClass(returnString);  
        // System.out.println("############################");  
        // System.out.println(returnString);  
        return returnString;  
    }  
}
